QOL Healthcare Utilization (VSURF and Analysis of Deviance)

Author

Miguel Fudolig

Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)

Setup

Code
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
         across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
         across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
         `Primary Language` = as.factor(`Primary Language`))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
Code
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Unmet Health Need

VSURF Approach

Important

We will be using the interpretation set to run an analysis of deviance to check model performance.

Code
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

imbal <- ROSE::ROSE(Unmet.Health.Need~.,
                          data=rfdata,
                          seed=3)$data

# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Unmet.Health.Need~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

VSURF(Unmet.Health.Need~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 9.8 secs 

 VSURF selected: 
    18 variables at thresholding step (in 4.6 secs)
    6 variables at interpretation step (in 2.7 secs)
    1 variables at prediction step (in 2.5 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "English.Speaking"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "English.Speaking"      "Duration.of.Residency" "Age"                  
[4] "Ethnicity"             "Discrimination"        "English.Difficulties" 
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1033941

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                English.Speaking    0.01272       0.00044 black
2           Duration.of.Residency    0.01109       0.00060 black
3                             Age    0.00999       0.00051 black
4                       Ethnicity    0.00871       0.00049   red
5                  Discrimination    0.00748       0.00052 black
6            English.Difficulties    0.00654       0.00035 black
7                Dental.Insurance    0.00629       0.00044 black
8                        Religion    0.00515       0.00055 black
9            Full.Time.Employment    0.00275       0.00027 black
10                      Belonging    0.00236       0.00046 black
11                  Income_median    0.00222       0.00031 black
12               Health.Insurance    0.00210       0.00031 black
13            Identify.Ethnically    0.00196       0.00029 black
14               Primary.Language    0.00186       0.00026 black
15       Familiarity.with.America    0.00173       0.00034 black
16                        US.Born    0.00144       0.00015 black
17                         Gender    0.00065       0.00023 black
18 Familiarity.with.Ethnic.Origin    0.00060       0.00030 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")

Analysis of Deviance Using the Interpretation Set

Code
all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.interp]


pred_vars <- ifelse("Ethnicity" %in% pred_vars, pred_vars,c(pred_vars,"Ethnicity"))
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")

options(contrasts = c("contr.treatment","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Health.Need ~ English.Speaking
Model 2: Unmet.Health.Need ~ English.Speaking
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1970     1243.3                     
2      1970     1243.3  0        0         
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1273.609        1273.609
Code
car::Anova(mod1, test.statistic = "F")
Analysis of Deviance Table (Type II tests)

Response: Unmet.Health.Need
Error estimate based on Pearson residuals 

                  Sum Sq   Df F value   Pr(>F)    
English.Speaking   68.93    3   22.93 1.37e-14 ***
Residuals        1974.00 1970                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -1.1550     0.2391  -4.830 1.36e-06 ***
English.SpeakingNot well   -0.3543     0.2672  -1.326    0.185    
English.SpeakingVery well  -1.7448     0.2879  -6.060 1.36e-09 ***
English.SpeakingWell       -1.1893     0.2783  -4.273 1.93e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1312.2  on 1973  degrees of freedom
Residual deviance: 1243.3  on 1970  degrees of freedom
AIC: 1251.3

Number of Fisher Scoring iterations: 5

Unmet Dental Need

VSURF Approach

Code
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

# imbal <- ROSE::ROSE(Unmet.Health.Need~.,
#                           data=rfdata,
#                           seed=3)$data

# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Unmet.Health.Need~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

VSURF(Unmet.Dental.Needs~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 7.5 secs 

 VSURF selected: 
    17 variables at thresholding step (in 4.6 secs)
    1 variables at interpretation step (in 2.5 secs)
    1 variables at prediction step (in 0.4 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1146596

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.01348       0.00064 black
2                             Age    0.01004       0.00060 black
3                English.Speaking    0.00917       0.00067 black
4           Duration.of.Residency    0.00865       0.00065 black
5                        Religion    0.00721       0.00048 black
6                   Income_median    0.00542       0.00039 black
7                       Ethnicity    0.00404       0.00048   red
8        Familiarity.with.America    0.00361       0.00041 black
9                Primary.Language    0.00345       0.00030 black
10           English.Difficulties    0.00318       0.00058 black
11           Full.Time.Employment    0.00196       0.00033 black
12                        US.Born    0.00153       0.00016 black
13 Familiarity.with.Ethnic.Origin    0.00152       0.00034 black
14               Health.Insurance    0.00138       0.00033 black
15                 Discrimination    0.00092       0.00031 black
16            Identify.Ethnically    0.00074       0.00031 black
17                      Belonging    0.00064       0.00026 black
18                         Gender   -0.00008       0.00024 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")
Note

Dental Insurance is the only variable in the interpretation set, which means Ethnicity might not be a significant predictor of unmet dental needs.

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1966     1300.0                     
2      1961     1291.3  5   8.7216   0.1207
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1       1344.41        1315.208
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -1.53967    0.16386  -9.396   <2e-16 ***
Dental.InsuranceYes   -1.39569    0.15702  -8.889   <2e-16 ***
EthnicityAsian Indian -0.18551    0.24169  -0.768   0.4427    
EthnicityFilipino      0.30009    0.28968   1.036   0.3002    
EthnicityKorean        0.35927    0.21118   1.701   0.0889 .  
EthnicityOther        -0.05362    0.38835  -0.138   0.8902    
EthnicityVietnamese    0.33570    0.21855   1.536   0.1245    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1395.0  on 1967  degrees of freedom
Residual deviance: 1291.3  on 1961  degrees of freedom
AIC: 1305.3

Number of Fisher Scoring iterations: 5
Note

The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.

Physical Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

# imbal <- ROSE::ROSE(Physical.Check.up~.,
#                           data=rfdata,
#                           seed=3)$data
# 
# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 12.9 secs 

 VSURF selected: 
    17 variables at thresholding step (in 6.3 secs)
    5 variables at interpretation step (in 3.4 secs)
    2 variables at prediction step (in 3.2 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Health.Insurance"     
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Age"                   "Health.Insurance"     
[4] "Dental.Insurance"      "Income_median"        
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2817396
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.02978       0.00085 black
2                             Age    0.02682       0.00089 black
3                Health.Insurance    0.02257       0.00060 black
4                Dental.Insurance    0.01358       0.00068 black
5                   Income_median    0.00707       0.00048 black
6                       Ethnicity    0.00670       0.00057   red
7                    EnglishSpeak    0.00567       0.00045 black
8                     EnglishDiff    0.00472       0.00067 black
9                      Employment    0.00456       0.00030 black
10                       Religion    0.00413       0.00050 black
11       Familiarity.with.America    0.00268       0.00035 black
12                         Gender    0.00247       0.00040 black
13 Familiarity.with.Ethnic.Origin    0.00223       0.00043 black
14               Primary.Language    0.00116       0.00028 black
15                        US.Born    0.00093       0.00028 black
16                 Discrimination    0.00064       0.00035 black
17            Identify.Ethnically    0.00047       0.00050 black
18                      Belonging    0.00020       0.00040 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance + 
    Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance + 
    Dental.Insurance + Income_median + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1960     2152.4                          
2      1955     2115.6  5   36.766 6.673e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2199.048        2197.895
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.002804   0.225453  -8.883  < 2e-16 ***
Duration.of.Residency  0.032732   0.005457   5.998 2.00e-09 ***
Age                    0.019095   0.003837   4.976 6.48e-07 ***
Health.InsuranceYes    1.190926   0.161703   7.365 1.77e-13 ***
Dental.InsuranceYes    0.544998   0.126932   4.294 1.76e-05 ***
Income_medianAbove     0.370270   0.119702   3.093  0.00198 ** 
EthnicityAsian Indian  0.097049   0.159050   0.610  0.54174    
EthnicityFilipino      0.435205   0.216839   2.007  0.04474 *  
EthnicityKorean       -0.486941   0.157210  -3.097  0.00195 ** 
EthnicityOther        -0.259744   0.248770  -1.044  0.29643    
EthnicityVietnamese    0.438791   0.173259   2.533  0.01132 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2466.2  on 1965  degrees of freedom
Residual deviance: 2115.6  on 1955  degrees of freedom
AIC: 2137.6

Number of Fisher Scoring iterations: 4

Dental Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

# imbal <- ROSE::ROSE(Physical.Check.up~.,
#                           data=rfdata,
#                           seed=3)$data
# 
# VSURF(Physical.Check.up~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 10 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.5 secs)
    3 variables at interpretation step (in 3.1 secs)
    3 variables at prediction step (in 1.4 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2958695
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.05153       0.00118 black
2           Duration.of.Residency    0.04922       0.00089 black
3                        Religion    0.01600       0.00076 black
4                       Ethnicity    0.01322       0.00082   red
5                             Age    0.01149       0.00064 black
6                Health.Insurance    0.00828       0.00042 black
7                    EnglishSpeak    0.00781       0.00060 black
8                   Income_median    0.00704       0.00055 black
9             Identify.Ethnically    0.00395       0.00053 black
10                    EnglishDiff    0.00328       0.00062 black
11 Familiarity.with.Ethnic.Origin    0.00287       0.00047 black
12       Familiarity.with.America    0.00260       0.00039 black
13                      Belonging    0.00222       0.00055 black
14                         Gender    0.00218       0.00046 black
15                     Employment    0.00212       0.00020 black
16                        US.Born    0.00177       0.00021 black
17               Primary.Language    0.00172       0.00035 black
18                 Discrimination    0.00087       0.00024 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1952     2209.7                     
2      1947     2201.0  5   8.7418   0.1198
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2307.098        2277.933
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.975528   0.152696  -6.389 1.67e-10 ***
Dental.InsuranceYes    1.609345   0.108703  14.805  < 2e-16 ***
Duration.of.Residency  0.045686   0.004692   9.738  < 2e-16 ***
ReligionBuddhist       0.246415   0.205398   1.200   0.2303    
ReligionCatholic       0.126604   0.202056   0.627   0.5309    
ReligionHindu         -0.292057   0.326191  -0.895   0.3706    
ReligionMuslim        -0.251392   0.402636  -0.624   0.5324    
ReligionOther         -0.522105   0.423661  -1.232   0.2178    
ReligionProtestant     0.003237   0.170527   0.019   0.9849    
EthnicityAsian Indian -0.498288   0.312440  -1.595   0.1108    
EthnicityFilipino     -0.075560   0.238626  -0.317   0.7515    
EthnicityKorean       -0.414204   0.173855  -2.382   0.0172 *  
EthnicityOther        -0.318398   0.268028  -1.188   0.2349    
EthnicityVietnamese   -0.331430   0.187937  -1.764   0.0778 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2659.6  on 1960  degrees of freedom
Residual deviance: 2201.0  on 1947  degrees of freedom
AIC: 2229

Number of Fisher Scoring iterations: 4

Folk medicine

VSURM

Code
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

# imbal <- ROSE::ROSE(Folkmedicine~.,
#                           data=rfdata,
#                           seed=3)$data

# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
# VSURF(Folkmedicine~.,imbal,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

VSURF(Folkmedicine~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod
Warning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 9.2 secs 

 VSURF selected: 
    17 variables at thresholding step (in 4.7 secs)
    3 variables at interpretation step (in 2.7 secs)
    1 variables at prediction step (in 1.8 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"                   "Duration.of.Residency" "Ethnicity"            
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1382383

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.01716       0.00066 black
2           Duration.of.Residency    0.01315       0.00055 black
3                       Ethnicity    0.01311       0.00065   red
4                English.Speaking    0.00899       0.00058 black
5                        Religion    0.00684       0.00066 black
6                   Income_median    0.00549       0.00036 black
7            English.Difficulties    0.00472       0.00043 black
8                Primary.Language    0.00262       0.00026 black
9  Familiarity.with.Ethnic.Origin    0.00262       0.00032 black
10       Familiarity.with.America    0.00260       0.00035 black
11           Full.Time.Employment    0.00235       0.00042 black
12                 Discrimination    0.00127       0.00034 black
13               Dental.Insurance    0.00115       0.00023 black
14               Health.Insurance    0.00088       0.00020 black
15            Identify.Ethnically    0.00070       0.00037 black
16                        US.Born    0.00067       0.00014 black
17                         Gender    0.00055       0.00026 black
18                      Belonging    0.00013       0.00033 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1944     1507.7                          
2      1939     1450.9  5    56.75 5.693e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1511.497        1530.377
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.612602   0.219844 -11.884  < 2e-16 ***
Age                    0.022611   0.004636   4.877 1.08e-06 ***
Duration.of.Residency  0.007417   0.005960   1.244 0.213371    
EthnicityAsian Indian -0.809613   0.218076  -3.713 0.000205 ***
EthnicityFilipino     -0.742468   0.272122  -2.728 0.006364 ** 
EthnicityKorean        0.234543   0.173006   1.356 0.175197    
EthnicityOther        -0.735426   0.357019  -2.060 0.039408 *  
EthnicityVietnamese   -1.084595   0.232545  -4.664 3.10e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1552.9  on 1946  degrees of freedom
Residual deviance: 1450.9  on 1939  degrees of freedom
AIC: 1466.9

Number of Fisher Scoring iterations: 5